Basic Statistics

Correlation


In [1]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.stat import Correlation
from pyspark.sql import SparkSession

In [2]:
# Reuse the active SparkSession (e.g. the one a PySpark kernel predefines) or
# start a new one, so the notebook also runs top-to-bottom on a plain Python
# kernel where `spark` is not injected.
spark = SparkSession.builder.getOrCreate()

# Four sample rows, each holding one 4-dimensional feature vector, in a mix of
# sparse and dense encodings. Index 2 is 0.0 in every row (the sparse rows only
# set indices 0 and 3), so that feature is constant across the data set.
data = [(Vectors.sparse(4, [(0, 1.0), (3, -2.0)]),),
        (Vectors.dense([4.0, 5.0, 0.0, 3.0]),),
        (Vectors.dense([6.0, 7.0, 0.0, 8.0]),),
        (Vectors.sparse(4, [(0, 9.0), (3, 1.0)]),)]

# Every row is a 1-tuple, so the DataFrame has the single column "features".
df = spark.createDataFrame(data, ["features"])

In [ ]:
# Scratch check: densify the same sparse vector used as the first sample row.
# A size-4 vector with non-zeros at indices 0 and 3 should expand to
# [1.0, 0.0, 0.0, -2.0].
Vectors.sparse(4, [(0, 1.0), (3, -2.0)]).toArray()

In [3]:
# No method is passed to corr(), so its default is used (Pearson, per the label
# printed below). The result is a DataFrame; head() fetches its first row and
# field 0 of that row is the correlation matrix.
pearson_df = Correlation.corr(df, "features")
r1 = pearson_df.head()

# Expect nan wherever feature index 2 is involved: it is 0.0 in every input
# row, and the correlation with a constant (zero-variance) column is undefined.
print(f"Pearson correlation matrix:\n{r1[0]!s}")


Pearson correlation matrix:
DenseMatrix([[ 1.        ,  0.05564149,         nan,  0.40047142],
             [ 0.05564149,  1.        ,         nan,  0.91359586],
             [        nan,         nan,  1.        ,         nan],
             [ 0.40047142,  0.91359586,         nan,  1.        ]])

In [4]:
# Same computation as the Pearson cell, but explicitly requesting the
# rank-based "spearman" method. head() fetches the first result row and field 0
# of that row is the correlation matrix.
spearman_df = Correlation.corr(df, "features", "spearman")
r2 = spearman_df.head()

# Feature index 2 is 0.0 in every input row, so its off-diagonal entries come
# out as nan here as well.
print(f"Spearman correlation matrix:\n{r2[0]!s}")


Spearman correlation matrix:
DenseMatrix([[ 1.        ,  0.10540926,         nan,  0.4       ],
             [ 0.10540926,  1.        ,         nan,  0.9486833 ],
             [        nan,         nan,  1.        ,         nan],
             [ 0.4       ,  0.9486833 ,         nan,  1.        ]])

Hypothesis testing


In [ ]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.stat import ChiSquareTest

In [ ]:
# Six labelled samples as (label, features) pairs: a 0.0/1.0 label and a
# 2-dimensional dense feature vector.
# NOTE: this rebinds `data` and `df` from the Correlation section above.
data = [
    (0.0, Vectors.dense([0.5, 10.0])),
    (0.0, Vectors.dense([1.5, 20.0])),
    (1.0, Vectors.dense([1.5, 30.0])),
    (0.0, Vectors.dense([3.5, 30.0])),
    (0.0, Vectors.dense([3.5, 40.0])),
    (1.0, Vectors.dense([3.5, 40.0])),
]

# Column names are given in the same order as the tuple fields.
df = spark.createDataFrame(data, schema=["label", "features"])

In [ ]:
# Run the chi-square test on the "features" column against the "label" column
# and fetch the first row of the result.
chi_square_df = ChiSquareTest.test(df, "features", "label")
r = chi_square_df.head()

# Print the three result fields read from that row.
print(f"pValues: {r.pValues!s}")
print(f"degreesOfFreedom: {r.degreesOfFreedom!s}")
print(f"statistics: {r.statistics!s}")

In [ ]: